The aim of this analysis is to render some results from the phos site conservation into a heatmap and maybe a network.
library(here)
## here() starts at /Volumes/HPC-Home/NCM_NT_1705_22022023_PHOS_SITE_CONS
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
df <- read_csv(here("results/merged_test_sites.csv"),
col_types = cols(
phos_site_id = col_character(),
mo_protein_id = col_character(),
species_compared = col_character(),
has_ortholog = col_logical(),
best_hit_ortholog = col_character(),
has_hit_in_ortholog = col_logical(),
best_hit_ortholog_score = col_double(),
best_hit_ortholog_identity = col_character(),
num_p_sites_in_mo = col_integer(),
num_sites_matched_in_ortholog = col_integer(),
sites_found_in_ortho = col_integer(),
mo_peptide = col_character(),
mo_seq_in_hit = col_character(),
orth_seq_in_hit = col_character(),
annotated_seq = col_character(),
mod_pattern = col_character()
)
) |>
transmute(phos_site_id, mo_protein_id, species_compared, has_ortholog, best_hit_ortholog, num_p_sites_in_mo, num_sites_matched_in_ortholog, prop_found = 100 * num_sites_matched_in_ortholog / num_p_sites_in_mo)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
As well as a summary cluster we want to figgify it, so get pretty names and lifestyle info.
trophism_info <- readr::read_csv(here("lib", "all_proteomes_lifestyles.csv")) |> rename("Saprotroph"="Saprophyte") |>
select( name, tag, Saprotroph, Endophyte, Symbiont, Commensal,Biotroph, Hemibiotroph, Necrotroph) |>
filter(tag != "Mo8") |>
pivot_longer(cols = c(Saprotroph, Endophyte, Symbiont, Commensal,Biotroph, Hemibiotroph, Necrotroph),names_to = "trophism", values_to = "trophism_val") |>
group_by(trophism, tag) |>
filter(trophism_val == TRUE) |>
summarise( trophism = first(trophism),
name = name
) |>
arrange(tag)
## Rows: 42 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): species_strain, name, Comment, version, tag, source, path, path2
## lgl (12): Saprophyte, Biotroph, Hemibiotroph, Necrotroph, Endophyte, Symbion...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'trophism'. You can override using the `.groups` argument.
appressorium_info <- readr::read_csv(here("lib", "all_proteomes_lifestyles.csv")) |>
rename("Hyaline" = "Hyaline Appressorium", "Compound"="compound appressorium", "Melanized"="Melanized Appressorium","None"="No appressorium") |>
select( name, tag, "Compound", "Hyaline", "Melanized", "None") |>
filter(tag != "Mo8") |>
pivot_longer(cols = c("Compound", "Hyaline", "Melanized", "None"), names_to = "appressorium", values_to = "appressorium_val") |>
group_by(appressorium, tag) |>
filter(appressorium_val == TRUE) |>
summarise( appressorium = first(appressorium),
name = name) |>
arrange(tag)
## Rows: 42 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): species_strain, name, Comment, version, tag, source, path, path2
## lgl (12): Saprophyte, Biotroph, Hemibiotroph, Necrotroph, Endophyte, Symbion...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'appressorium'. You can override using the `.groups` argument.
pmk1_info <- readr::read_csv(here("lib", "all_proteomes_lifestyles.csv")) |>
rename("pmk1_path"="Pmk1 pathogenicity") |>
filter(tag != "Mo8") |>
mutate("pmk1_path" = if_else(pmk1_path, "PMK1 Required", "PMK1 Not Required")) |>
select( name, tag, "pmk1_path" )
## Rows: 42 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): species_strain, name, Comment, version, tag, source, path, path2
## lgl (12): Saprophyte, Biotroph, Hemibiotroph, Necrotroph, Endophyte, Symbion...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- left_join(df, appressorium_info, by = c("species_compared"="tag"))
sort(unique(df$num_p_sites_in_mo))
## [1] 1 2 3
sort(unique(df$prop_found))
## [1] 0.00000 33.33333 50.00000 100.00000
sort(unique(df$num_sites_matched_in_ortholog))
## [1] 0 1
wide <- df |> pivot_wider(id_cols = phos_site_id,
names_from = name,
values_from = prop_found
)
tmp <- wide
phos_site_id <- tmp$phos_site_id
tmp$phos_site_id <- NULL
prop_mat <- data.matrix(tmp)
rownames(prop_mat) <- phos_site_id
#HACK! replace NO ORTHOLOG NA wit -1 for cluster
prop_mat[is.na(prop_mat)] <- -1
nr_prop_mat <- prop_mat[rowSums(prop_mat) != -41,]
Estimate the number of clusters in the rows - how do the phos-sites cluster
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_nbclust(nr_prop_mat, kmeans, method="wss", k.max = 50)
## Warning: did not converge in 10 iterations
Not much improvement after 9. Will use that
kmeans_clust <- kmeans(nr_prop_mat, 9, nstart = 25, iter.max=1000)
cluster_sites <- tibble::tibble(
phos_site_id = names(kmeans_clust$cluster),
cluster_number = kmeans_clust$cluster
)
saveRDS(kmeans_clust, here("results/kmeans.RDS"))
annot_df <- df |> left_join(cluster_sites, by="phos_site_id")
readr::write_csv(annot_df, "cluster_gene_info.csv")
We want quite a lot out of this. First I want to know how many clusters the summary data has in the columns. IE how do the species cluster given the row clustering.
fviz_nbclust(t(kmeans_clust$centers), kmeans, method="wss", k.max =39)
About 7 or about 15 clusters
set.seed(123)
kmean_tmp <- ComplexHeatmap::Heatmap(kmeans_clust$centers)
species_order <- colnames(kmeans_clust$centers)#[order.dendrogram(ComplexHeatmap::column_dend(kmean_tmp))]
info <- tibble::tibble(
name = species_order
) |>
left_join(trophism_info, by = "name") |>
left_join(appressorium_info, by = "name") |>
left_join(pmk1_info, by = "name")
We calculated clusters in a Col-0 DMPK1 dataset previously. The
cluster 2 and 3 there were of interest and we’d like to see how the
phos-sites in those cluster map to the clusters here. We’ll need to
count phos-sites from those in each of the clusters here. The cluster
info to take from is in
raw/010_kmeans_cluster_3_on_guy11_and_dpmk1.csv and
010_kmeans_cluster_4_on_guy11_and_dpmk1.csv. The IDs will
need converting.
library(here)
library(dplyr)
convert_ids <- function(df) {
# convert site data from cluster in format "MGG_01258T0 [68-87]-2xPhospho [S7(100); T14(100)]"
# to site data in phos-cons format MGG_00111T0-S13-S16
# Note that the phos sites without positions
# "MGG_04185T0 [225-250]-1xPhospho [S/T]"
# come out as MGG_04185T0-NA so are not mappable to the IDs in the phos-cons data
# which are like MGG_00111T0-S13-S16
df <- df |>
mutate(gene = substr(id, 1, 11),
offset = stringr::str_extract(id, "\\[\\d+-\\d+\\]")
) |>
tidyr::separate_wider_delim(offset, names = c("start", "stop"), delim = "-") |>
mutate(
start = stringr::str_remove(start,"\\["),
start = as.integer(start),
stop = stringr::str_remove(stop, "\\]"),
stop = as.integer(stop),
sites = stringr::str_extract(id, "\\[[STY]\\d+.*\\]"),
sites = stringr::str_remove_all(sites, "\\(\\d+\\)"),
sites = stringr::str_remove_all(sites, "\\(\\d+\\.\\d+\\)"),
sites = stringr::str_remove_all(sites, "\\["),
sites = stringr::str_remove_all(sites, "\\]"),
sites = stringr::str_extract_all(sites, "[STY]\\d+"),
site_letters = purrr::map(sites, function(x){stringr::str_extract(x, "[STY]")}),
site_numbers = purrr::map(sites, function(x){as.integer(stringr::str_extract(x, "\\d+"))}),
proper_starts = purrr::map2(site_numbers, start, function(x,y){ (x + y)-1 }),
site_and_start = purrr::map2(proper_starts, site_letters, function(x,y){paste0(y,x)}),
sites = purrr::map_chr(site_and_start, function(x){paste0(x, collapse="-")}),
new_id = paste(gene, sites, sep="-")
) |>
select(id, gene, new_id)
}
clus1 <- readr::read_csv(here("raw/010_kmeans_cluster_1_on_guy11_and_dpmk1.csv")) |>
convert_ids()
## Rows: 1113 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): id
## dbl (12): dpmk1-0, Guy11-0, dpmk1-1, Guy11-1, dpmk1-1.5, Guy11-1.5, dpmk1-2,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
clus3 <- readr::read_csv(here("raw/010_kmeans_cluster_3_on_guy11_and_dpmk1.csv")) |>
convert_ids()
## Rows: 1372 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): id
## dbl (12): dpmk1-0, Guy11-0, dpmk1-1, Guy11-1, dpmk1-1.5, Guy11-1.5, dpmk1-2,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
clus4 <- readr::read_csv(here("raw/010_kmeans_cluster_4_on_guy11_and_dpmk1.csv")) |>
convert_ids()
## Rows: 1781 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): id
## dbl (12): dpmk1-0, Guy11-0, dpmk1-1, Guy11-1, dpmk1-1.5, Guy11-1.5, dpmk1-2,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Now loop through clusters of phos-cons data and work out how many of
clus3 and clus4 IDs are in each.
counts_in_clusters <- cluster_sites |>
group_by(cluster_number) |>
summarize( cluster_size = length(cluster_number),
num_in_clus1 = sum(clus1$new_id %in% phos_site_id),
prop_in_clus1 = num_in_clus1 / cluster_size,
clus1_size = 1113,
num_in_clus3 = sum(clus3$new_id %in% phos_site_id),
prop_in_clus3 = num_in_clus3 / cluster_size,
clus3_size = 1281,
num_in_clus4 = sum(clus4$new_id %in% phos_site_id),
prop_in_clus4 = num_in_clus4 / cluster_size,
clus4_size = 861,
total_phos_sites = nrow(prop_mat),
pclus1 = phyper(num_in_clus1, clus1_size, total_phos_sites - clus1_size, cluster_size ),
padj.clus1 = p.adjust(pclus1),
pclus3 = phyper(num_in_clus3, clus3_size, total_phos_sites - clus3_size, cluster_size ),
padj.clus3 = p.adjust(pclus3),
pclus4 = phyper(num_in_clus4, clus4_size, total_phos_sites - clus4_size, cluster_size ),
padj.clus4 = p.adjust(pclus4)
)
counts_in_clusters
## # A tibble: 9 × 18
## cluster_number cluster_size num_in_clus1 prop_in_clus1 clus1_size num_in_clus3
## <int> <int> <int> <dbl> <dbl> <int>
## 1 1 220 32 0.145 1113 48
## 2 2 279 62 0.222 1113 65
## 3 3 337 61 0.181 1113 84
## 4 4 96 23 0.240 1113 21
## 5 5 181 25 0.138 1113 43
## 6 6 225 50 0.222 1113 46
## 7 7 290 55 0.190 1113 67
## 8 8 1937 361 0.186 1113 461
## 9 9 109 24 0.220 1113 19
## # ℹ 12 more variables: prop_in_clus3 <dbl>, clus3_size <dbl>,
## # num_in_clus4 <int>, prop_in_clus4 <dbl>, clus4_size <dbl>,
## # total_phos_sites <int>, pclus1 <dbl>, padj.clus1 <dbl>, pclus3 <dbl>,
## # padj.clus3 <dbl>, pclus4 <dbl>, padj.clus4 <dbl>
readr::write_csv(counts_in_clusters, here("results/counts_in_outside_clusters.csv"))
Looks like a fairly low number of phos-sites
set.seed(123)
col_fun = circlize::colorRamp2(
c(-1,0,33, 50, 100),
c("gray", "white", "#FDE725FF", "#238A8DFF", "#440154FF")
)
troph_cols <- c("Biotroph" = "#b2df8a",
"Commensal" = "#e1d5c4",
"Endophyte" = "#80493C",
"Hemibiotroph" = "#33a02c",
"Necrotroph" = "#075C07",
"Saprotroph" = "#542519",
"Symbiont" = "yellow"
)
app_cols <- c("Compound" = "lightblue",
"Hyaline" = "pink",
"Melanized" = "brown",
"None" = "gray")
#app_cols <- c("Melanized" = "brown",
# "Non-melanized"= "wheat",
# "None" = "gray")
pmk1_cols <- c("PMK1 Required" = "red", "PMK1 Not Required" = "white")
kmean_summ <- ComplexHeatmap::Heatmap(kmeans_clust$centers,
col = col_fun,
bottom_annotation = ComplexHeatmap::HeatmapAnnotation(
Lifestyle = info$trophism,
Appressorium = info$appressorium,
PMK1 = info$pmk1_path,
col =list(Lifestyle=troph_cols,
Appressorium = app_cols,
PMK1=pmk1_cols)
),
left_annotation = ComplexHeatmap::rowAnnotation(
`Prop in Cluster 3` = counts_in_clusters$prop_in_clus3,
`Prop in Cluster 4` = counts_in_clusters$prop_in_clus4
),
show_heatmap_legend = FALSE)
lgd <- ComplexHeatmap::Legend(direction = "vertical",
col_fun = col_fun,
#labels = c("No hit", 0, 33, 50, 100),
title = "Percent target phos sites found")
col_order <- ComplexHeatmap::column_dend(kmean_summ)
## Warning: The heatmap has not been initialized. You might have different results
## if you repeatedly execute this function, e.g. when row_km/column_km was
## set. It is more suggested to do as `ht = draw(ht); column_dend(ht)`.
ComplexHeatmap::draw(kmean_summ,padding= grid::unit(c(0,0,0,3), "in"))
ComplexHeatmap::draw(lgd, x = grid::unit(12.5, "in"), y = grid::unit(4, "in"))
species_tree <- ComplexHeatmap::column_dend(kmean_summ)
## Warning: The heatmap has not been initialized. You might have different results
## if you repeatedly execute this function, e.g. when row_km/column_km was
## set. It is more suggested to do as `ht = draw(ht); column_dend(ht)`.
saveRDS(species_tree, here::here("results/heatmap_species_dendro.RDS"))
Number of phos sites in each cluster
table(kmeans_clust$cluster)
##
## 1 2 3 4 5 6 7 8 9
## 220 279 337 96 181 225 290 1937 109
biggest_cluster <- which.max(kmeans_clust$size) #assume biggest cluster is all the empty hits
for (i in 1:max(kmeans_clust$cluster)) {
if (i != biggest_cluster) {
rows <- kmeans_clust$clust == i
mini_mat <- nr_prop_mat[rows,]
#print(ncol(mini_mat))
mini_hm <- ComplexHeatmap::Heatmap(mini_mat,
col = col_fun,
column_order = order.dendrogram(col_order),
show_heatmap_legend = FALSE)
lgd <- ComplexHeatmap::Legend(direction = "vertical",
col_fun = col_fun,
#labels = c("No hit", 0, 50, 100),
title = paste0("Cluster ", i, " Percent target phos sites found")
)
ComplexHeatmap::draw(mini_hm,padding= grid::unit(c(0,0,0,3), "in"))
ComplexHeatmap::draw(lgd, x = grid::unit(10.5, "in"), y = grid::unit(4, "in"))
}
}